{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# FMA: A Dataset For Music Analysis\n",
    "\n",
    "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n",
    "\n",
    "## Analysis\n",
    "\n",
    "All numbers and figures which appear in the [paper] and much more.\n",
    "\n",
    "[paper]: https://arxiv.org/abs/1612.01840"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import IPython.display as ipd\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "\n",
    "import utils\n",
    "\n",
    "sns.set_context(\"notebook\", font_scale=1.5)\n",
    "plt.rcParams['figure.figsize'] = (17, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the metadata and feature tables through the project-local helper.\n",
    "tracks, genres, features, echonest = [\n",
    "    utils.load(filename)\n",
    "    for filename in ('tracks.csv', 'genres.csv', 'features.csv', 'echonest.csv')\n",
    "]\n",
    "\n",
    "# Features must be indexed exactly like tracks; echonest may only reference known tracks.\n",
    "np.testing.assert_array_equal(features.index, tracks.index)\n",
    "assert echonest.index.isin(tracks.index).all()\n",
    "\n",
    "tuple(table.shape for table in (tracks, genres, features, echonest))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1 Size\n",
    "\n",
    "Todo:\n",
    "* When are tracks mostly added.\n",
    "* Which tracks got deleted."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Headline counts: tracks, distinct artists / albums, and genres with at least one track.\n",
    "n_artists = len(tracks['artist', 'id'].unique())\n",
    "n_albums = len(tracks['album', 'id'].unique())\n",
    "n_genres = sum(genres['#tracks'] > 0)\n",
    "print('{} tracks, {} artists, {} albums, {} genres'.format(len(tracks), n_artists, n_albums, n_genres))\n",
    "\n",
    "# mean_duration is reused by the dimensionality estimate in the next cell.\n",
    "track_durations = tracks['track', 'duration']\n",
    "mean_duration = track_durations.mean()\n",
    "total_days = sum(track_durations) / 3600 / 24\n",
    "print('track duration: {:.0f} days total, {:.0f} seconds average'.format(total_days, mean_duration))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rough number of audio samples in an average track, and in the whole dataset.\n",
    "# NOTE(review): 44000 * 2 presumably approximates a 44.1 kHz sampling rate times 2 channels - confirm.\n",
    "dimensionality = mean_duration * 44000 * 2\n",
    "total_samples = dimensionality * len(tracks)\n",
    "print('sample dimensionality: {:.1e}'.format(dimensionality))\n",
    "print('total size, i.e. number of audio samples: {:.1e}'.format(total_samples))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size of each subset; `<=` makes every subset include the smaller ones.\n",
    "# NOTE(review): the factor 30 presumably is the clip length in seconds - confirm.\n",
    "subset_column = tracks['set', 'subset']\n",
    "for subset in subset_column.unique():\n",
    "    n_in_subset = sum(subset_column <= subset)\n",
    "    n_days = n_in_subset * 30 / 3600 / 24\n",
    "    print('{:6} {:6} tracks  {:.1f} days'.format(subset, n_in_subset, n_days))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('{} deleted tracks (largest track_id is {})'.format(tracks.index.max() - len(tracks), tracks.index.max()))\n",
    "print('First track: {}'.format(tracks['track', 'date_created'].min()))\n",
    "\n",
    "# One row per track, indexed by creation date, to follow the growth of the collection.\n",
    "d = pd.DataFrame(tracks.index, index=tracks['track', 'date_created'].values)\n",
    "d['indicator'] = 1\n",
    "\n",
    "fig, ax1 = plt.subplots()\n",
    "ax2 = ax1.twinx()\n",
    "\n",
    "# Left axis: track ids and running count of tracks, both against creation date.\n",
    "d['track_id'].plot(ax=ax1)\n",
    "d['indicator'].cumsum().plot(ax=ax1)\n",
    "ax1.set_ylabel('#tracks')\n",
    "ax1.set_ylim(0, 160000)\n",
    "\n",
    "# Right axis: number of tracks added per two-month bin.\n",
    "(d['indicator'] * -100).plot(ax=ax2, style='r')  # needed for no apparent reason\n",
    "color = sns.color_palette('deep', 3)[2]\n",
    "d['indicator'].resample('2M').sum().fillna(0).plot(ax=ax2, style='--', color=color)\n",
    "ax2.set_ylabel('#tracks added')\n",
    "# Single limit call (an earlier, immediately overridden one was removed).\n",
    "# The dummy line drawn at -100 falls outside this range.\n",
    "ax2.set_ylim(0, 4000)\n",
    "ax2.grid(False)\n",
    "\n",
    "# Skip the first line of ax2 (the dummy one) when building the legend.\n",
    "lns = ax1.get_lines() + [ax2.get_lines()[1]]\n",
    "ax1.legend(lns, ['largest track id', '#tracks still present', '#tracks added per 2 months'], loc='lower right')\n",
    "\n",
    "plt.savefig('growth.pdf')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.1 Splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SPLITS = ['training', 'validation', 'test']\n",
    "SUBSETS = ['small', 'medium', 'large']\n",
    "\n",
    "# Number of tracks per split for each subset, with training/validation and training/test ratios.\n",
    "print('subset    #train    #val   #test  val_ratio test_ratio')\n",
    "for subset in SUBSETS:\n",
    "    in_subset = tracks['set', 'subset'] <= subset\n",
    "    counts = [sum((tracks['set', 'split'] == split) & in_subset) for split in SPLITS]\n",
    "    # Convert the list before dividing: scalar / list only works by accident with NumPy scalars.\n",
    "    ratios = counts[0] / np.array(counts[1:])\n",
    "    print('{:8s} {:7d} {:7d} {:7d} {:8.2f} {:9.2f}'.format(subset, *counts, *ratios))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per top-level genre: number of tracks in each split, for the small and medium subsets.\n",
    "for subset in ['small', 'medium']:\n",
    "    # Keep the subset name and its boolean mask in separate variables.\n",
    "    in_subset = tracks['set', 'subset'] <= subset\n",
    "\n",
    "    # First entry of the genres_top list of every track in the subset.\n",
    "    genre_top = tracks.loc[in_subset, ('track', 'genres_top')].map(lambda x: x[0])\n",
    "    # Work on a copy so that adding columns never touches (or warns about) `genres`.\n",
    "    d = genres.loc[genre_top.unique()].copy()\n",
    "\n",
    "    for split in SPLITS:\n",
    "        b = tracks.loc[in_subset, ('set', 'split')] == split\n",
    "        d['#' + split] = genre_top[b].value_counts()\n",
    "\n",
    "    d['val_ratio'] = d['#training'] / d['#validation']\n",
    "    d['test_ratio'] = d['#training'] / d['#test']\n",
    "\n",
    "    ipd.display(d.sort_values('#training', ascending=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per genre (counting all genres of a track): number of tracks in each split.\n",
    "d = pd.DataFrame(index=genres.index, columns=SPLITS)\n",
    "for genre in genres.index:\n",
    "    has_genre = tracks['track', 'genres_all'].map(lambda track_genres: genre in track_genres)\n",
    "    d.loc[genre] = tracks.loc[has_genre, ('set', 'split')].value_counts()\n",
    "d['val_ratio'] = d['training'] / d['validation']\n",
    "d['test_ratio'] = d['training'] / d['test']\n",
    "d = d.sort_values('training', ascending=False)\n",
    "\n",
    "# Show both ends of the ranking.\n",
    "ipd.display(d.head(10))\n",
    "ipd.display(d.tail(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2 Metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def isnull(column, df=tracks):\n",
    "    \"\"\"Return a boolean Series flagging the rows of `df` where `column` carries no information.\n",
    "\n",
    "    `column` is a (level 0, level 1) column key. List-valued columns are empty when\n",
    "    the list is empty, integer columns when the value is <= 0, anything else when it is null.\n",
    "    \"\"\"\n",
    "    # genres_top holds lists as well (it is indexed with x[0] elsewhere in this notebook).\n",
    "    if column[1] in ['tags', 'genres', 'genres_all', 'genres_top']:\n",
    "        return df[column].apply(lambda x: len(x) == 0)\n",
    "    # The builtin int is what the removed np.int alias stood for.\n",
    "    elif df.dtypes[column] == int:\n",
    "        return df[column] <= 0\n",
    "    else:\n",
    "        return df[column].isnull()\n",
    "\n",
    "def count(series):\n",
    "    \"\"\"Return (n, p): number and percentage of entities for which the column `series.name` is filled.\n",
    "\n",
    "    Track columns are counted over all tracks; other columns over the rows left after\n",
    "    dropping duplicates of the matching (level 0, 'id') column, i.e. once per entity.\n",
    "    \"\"\"\n",
    "    col0 = series.name[0]\n",
    "    df = tracks if col0 == 'track' else tracks.drop_duplicates((col0, 'id'))\n",
    "    n = (~isnull(series.name, df)).sum()\n",
    "    p = n / len(df) * 100\n",
    "    return n, p\n",
    "\n",
    "# Columns / metadata usage across dataset.\n",
    "d = pd.DataFrame(index=tracks.columns.drop('set'), columns=['n', 'p'])\n",
    "# Wrap the tuple in a Series so that apply() expands it into the 'n' and 'p' columns\n",
    "# regardless of the pandas version.\n",
    "d = d.apply(lambda row: pd.Series(count(row), index=['n', 'p']), axis=1)\n",
    "d['n'] = d['n'].astype(int)\n",
    "d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Excerpt as example in the paper.\n",
    "columns = [\n",
    "    ('track', 'title'),\n",
    "    ('track', 'genres'),\n",
    "    ('track', 'genres_all'),\n",
    "    ('track', 'genres_top'),\n",
    "    ('track', 'duration'),\n",
    "    ('track', 'listens'),\n",
    "    ('album', 'title'),\n",
    "    ('album', 'listens'),\n",
    "    ('album', 'tags'),\n",
    "    ('artist', 'name'),\n",
    "    ('artist', 'longitude'),\n",
    "    ('artist', 'latitude'),\n",
    "]\n",
    "\n",
    "# Keep only the tracks for which every selected column is filled.\n",
    "non_null = pd.Series(True, index=tracks.index)\n",
    "for column in columns:\n",
    "    non_null &= ~isnull(column)\n",
    "\n",
    "# Fixed seed so that the same eight tracks are picked on every run.\n",
    "shuffled = np.random.RandomState(42).permutation(tracks.index[non_null])\n",
    "tids = shuffled[:8]\n",
    "\n",
    "tracks.loc[tids, columns].head()\n",
    "\n",
    "#tracks.loc[tids, columns].to_latex('tracks.tex', formatters={\n",
    "#    ('artist', 'longitude'): '{:,.1f}'.format,\n",
    "#    ('artist', 'latitude'): '{:,.1f}'.format,\n",
    "#})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ten most frequent licenses.\n",
    "license_counts = tracks['track', 'license'].value_counts()\n",
    "license_counts.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ten most frequent language codes.\n",
    "language_counts = tracks['track', 'language_code'].value_counts()\n",
    "language_counts.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.1 Technical data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Histogram of track durations, restricted to tracks shorter than 800 seconds.\n",
    "durations = tracks['track', 'duration']\n",
    "shorter = durations[durations.values < 800]\n",
    "\n",
    "plt.figure(figsize=(10, 4))  # Poster: (7, 3)\n",
    "p = sns.distplot(shorter, kde=False, rug=False, color='k', hist_kws={'alpha': 0.4})\n",
    "p.set_xlim(0, 800)  # Poster: 500\n",
    "p.set_xlabel('duration [seconds]')\n",
    "p.set_ylabel('#tracks')\n",
    "plt.tight_layout()\n",
    "plt.savefig('duration_distribution.pdf')\n",
    "\n",
    "# Summary statistics over all tracks, long ones included.\n",
    "durations.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncommon bit rates are VBR encodings.\n",
    "bit_rates = tracks['track', 'bit_rate']\n",
    "most_common = bit_rates.value_counts().head(5).index.tolist()\n",
    "print('Common bit rates: {}'.format(most_common))\n",
    "print('Average bit rate: {:.0f} kbit/s'.format(bit_rates.mean()/1000))\n",
    "\n",
    "p = sns.distplot(bit_rates, kde=False, rug=False)\n",
    "p.set_xlabel('bit rate')\n",
    "p.set_ylabel('#tracks');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 User data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tags.\n",
    "# Number of tags per track (d1), per album (d2) and per artist (d3);\n",
    "# albums and artists are counted once thanks to drop_duplicates on their id.\n",
    "d1 = tracks['track', 'tags'].apply(len)\n",
    "d2 = tracks.drop_duplicates(('album', 'id'))\n",
    "d2 = d2['album', 'tags'].apply(len)\n",
    "d3 = tracks.drop_duplicates(('artist', 'id'))\n",
    "# NOTE(review): the -1 presumably discounts the tag holding the artist name - confirm.\n",
    "d3 = d3['artist', 'tags'].apply(len) - 1\n",
    "\n",
    "labels = ['track', 'album', 'artist']\n",
    "for l, d in zip(labels, [d1, d2, d3]):\n",
    "    # The minimum is clamped at 0 because of the -1 applied to the artist counts.\n",
    "    print('{}: from {} to {} tags'.format(l, max(d.min(), 0), d.max()))\n",
    "\n",
    "MAX = 13  # Poster: 11\n",
    "fig, ax1 = plt.subplots(figsize=(10, 4))  # Poster: (7, 3)\n",
    "ax2 = ax1.twinx()\n",
    "\n",
    "# The three histograms use bins shifted by 0.25 so that their bars sit side by side.\n",
    "# Tracks go on the left axis, albums and artists on the right one.\n",
    "ax1.hist(d1, bins=np.arange(MAX)+0.25, rwidth=0.2, color='C0', label=labels[0])\n",
    "ax2.hist(d2, bins=np.arange(MAX)+0.50, rwidth=0.2, color='C1', label=labels[1])\n",
    "ax2.hist(d3, bins=np.arange(MAX)+0.75, rwidth=0.2, color='C2', label=labels[2])\n",
    "\n",
    "ax1.set_xlabel('#tags')\n",
    "ax1.set_ylabel('#tracks')\n",
    "ax2.set_ylabel('#artists   /   #albums')\n",
    "ax1.set_xlim(0.5, MAX-0.5)\n",
    "ax1.set_xticks(range(1, MAX))\n",
    "ax1.set_ylim(0, 5000)\n",
    "ax2.set_ylim(0, 500)\n",
    "# One legend per axis, as each axis only knows about its own histograms.\n",
    "ax1.legend(loc='upper center')\n",
    "ax2.legend(loc='upper right')\n",
    "ax2.grid(False)\n",
    "\n",
    "fig.tight_layout()\n",
    "fig.savefig('tag_distribution.pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One artist tag is often the artist name.\n",
    "col = 'artist'\n",
    "d = tracks.drop_duplicates((col, 'id'))\n",
    "has_tags = d[col, 'tags'].apply(len) > 0\n",
    "d.loc[has_tags, [('artist', 'name'), (col, 'tags')]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Listens, favorites, comments.\n",
    "\n",
    "def plot(col0, col1, maxval, subplot=None):\n",
    "    \"\"\"Plot the histogram of a count column, restricted to values below `maxval`.\n",
    "\n",
    "    col0: 'track', 'album' or 'artist'; albums and artists are counted once each.\n",
    "    col1: name of the column to plot, e.g. 'listens'.\n",
    "    maxval: exclusive upper bound on the plotted values and right limit of the x-axis.\n",
    "    subplot: optional subplot specification passed to plt.subplot().\n",
    "\n",
    "    Raises ValueError if col0 is not one of the supported groups.\n",
    "    \"\"\"\n",
    "    if col0 == 'track':\n",
    "        d = tracks['track']\n",
    "    elif col0 in ['artist', 'album']:\n",
    "        d = tracks[col0].drop_duplicates('id')\n",
    "    else:\n",
    "        # Fail with a clear message instead of an UnboundLocalError further down.\n",
    "        raise ValueError('unknown column group: {!r}'.format(col0))\n",
    "    if subplot:\n",
    "        plt.subplot(subplot)\n",
    "    d = d[col1]\n",
    "    p = sns.distplot(d[d.values < maxval], kde=False, color='k', hist_kws=dict(alpha=0.4))\n",
    "    p.set_xlim(-1, maxval)\n",
    "    p.set_xlabel('#' + col1)\n",
    "    p.set_ylabel('#' + col0 + 's')\n",
    "\n",
    "plt.figure(figsize=(17, 10))\n",
    "plot('track', 'listens', 10e3, 221)\n",
    "plot('track', 'interest', 10e3, 222)\n",
    "plot('track', 'favorites', 100, 223)\n",
    "plot('track', 'comments', 20, 224)\n",
    "\n",
    "plt.figure(figsize=(17, 10))\n",
    "plot('album', 'listens', 100e3, 221)\n",
    "plot('album', 'favorites', 100, 223)\n",
    "plot('album', 'comments', 20, 224)\n",
    "\n",
    "plt.figure(figsize=(17, 5))\n",
    "plot('artist', 'favorites', 100, 121)\n",
    "plot('artist', 'comments', 20, 122)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same as above, formatted for the paper.\n",
    "plt.figure(figsize=(10, 4))  # Poster: (7, 3)\n",
    "plot(col0='album', col1='listens', maxval=40e3)  # Poster 20e3\n",
    "plt.tight_layout()\n",
    "plt.savefig('listens_distribution.pdf')\n",
    "\n",
    "# Largest number of listens of any album, for reference.\n",
    "album_listens = tracks['album', 'listens']\n",
    "album_listens.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Most listened albums.\n",
    "# One row per album id (first occurrence), ranked by number of listens.\n",
    "albums = tracks['album'].groupby('id').first()\n",
    "albums.sort_values('listens', ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.3 Dates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Named plot_dates so that it does not replace the plot() helper of section 2.2,\n",
    "# which takes different arguments and would otherwise break when its cells are re-run.\n",
    "def plot_dates(col0, col1):\n",
    "    \"\"\"Plot the yearly number of tracks, albums or artists according to a date column.\n",
    "\n",
    "    col0: 'track', 'album' or 'artist'; albums and artists are counted once each.\n",
    "    col1: name of the date column, e.g. 'date_created'.\n",
    "\n",
    "    Raises ValueError if col0 is not one of the supported groups.\n",
    "    \"\"\"\n",
    "    if col0 == 'track':\n",
    "        d = tracks['track']\n",
    "    elif col0 in ['artist', 'album']:\n",
    "        d = tracks[col0].drop_duplicates('id')\n",
    "    else:\n",
    "        # Fail with a clear message instead of an UnboundLocalError further down.\n",
    "        raise ValueError('unknown column group: {!r}'.format(col0))\n",
    "    # A Series of ones indexed by date; the yearly sums count the entries per year.\n",
    "    d = pd.Series(1, index=d[col1])\n",
    "    d.resample('A').sum().fillna(0).plot()\n",
    "\n",
    "plt.figure()\n",
    "plot_dates('track', 'date_recorded')\n",
    "plot_dates('album', 'date_released')\n",
    "\n",
    "plt.figure()\n",
    "plot_dates('artist', 'active_year_begin')\n",
    "plot_dates('artist', 'active_year_end')\n",
    "\n",
    "plt.figure()\n",
    "plot_dates('track', 'date_created')\n",
    "plot_dates('album', 'date_created')\n",
    "plot_dates('artist', 'date_created')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same as above, formatted for the paper.\n",
    "plt.figure(figsize=(5, 4))\n",
    "\n",
    "# Number of albums released per year, each album counted once.\n",
    "d = tracks['album'].drop_duplicates('id')\n",
    "d = pd.Series(1, index=d['date_released'])\n",
    "d = d.resample('A').sum().fillna(0)\n",
    "\n",
    "# Only the years from 1990 to 2017 are plotted.\n",
    "first_year = pd.to_datetime(1990, format='%Y')\n",
    "last_year = pd.to_datetime(2017, format='%Y')\n",
    "b = (d.index >= first_year) & (d.index <= last_year)\n",
    "d[b].plot(color='k')\n",
    "\n",
    "plt.xlabel('release year')\n",
    "plt.ylabel('#albums')\n",
    "plt.tight_layout()\n",
    "plt.savefig('album_release_year.pdf')\n",
    "\n",
    "# Full range of release years found in the data.\n",
    "d.index.min().year, d.index.max().year"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3 Artists & albums effect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of tracks per artist and per album.\n",
    "for effect in ['artist', 'album']:\n",
    "    d = tracks[effect, 'id'].value_counts()\n",
    "    ipd.display(d.head(5))\n",
    "    # One figure per effect; otherwise both histograms share the same axes\n",
    "    # and only the labels of the last one remain.\n",
    "    plt.figure()\n",
    "    # Only artists / albums with fewer than 50 tracks are shown.\n",
    "    p = sns.distplot(d[d.values < 50], kde=False)\n",
    "    p.set_xlabel('#tracks per ' + effect);\n",
    "    p.set_ylabel('#' + effect + 's');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of distinct artists per top-level genre (genres whose parent is 0).\n",
    "# Start from integer zeros: an empty Series without dtype is deprecated and may end up\n",
    "# with object dtype, which cannot be plotted.\n",
    "counts = pd.Series(0, index=genres.index[genres['parent'] == 0].values, name='#artists')\n",
    "for genre in counts.index:\n",
    "    b = tracks['track', 'genres_top'].map(lambda genres_top: genre in genres_top)\n",
    "    counts[genre] = len(tracks.loc[b, ('artist', 'id')].unique())\n",
    "# Replace the genre ids by their titles for the tick labels.\n",
    "counts.index = list(genres.loc[counts.index, 'title'])\n",
    "counts.sort_values(ascending=False).plot.bar()\n",
    "plt.ylabel('#artists');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4 Genres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Top-level genres found on tracks; -1 stands for tracks without one and is discarded.\n",
    "first_top = tracks['track', 'genres_top'].map(lambda x: x[0] if len(x) > 0 else -1)\n",
    "a = set(first_top.unique()) - {-1}\n",
    "\n",
    "# Top-level genres according to the genres table.\n",
    "top_level_ids = genres['top_level'].unique()\n",
    "b = set(genres.loc[top_level_ids].index)\n",
    "\n",
    "# Both views must agree.\n",
    "assert a == b\n",
    "\n",
    "print('{} top-level genres'.format(len(a)))\n",
    "is_root = genres['parent'] == 0\n",
    "genres[is_root].sort_values('#tracks', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of genres per track:\n",
    "* `genres`: they have introduced a [limit of 3 genres per track](https://twitter.com/therewasaguy/status/863426542075953152) early on.\n",
    "* `genres_all`: more genres per track as all coarser genres in the hierarchy are included. E.g. an Indie-Rock song is counted as a Rock song too."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Genres per track.\n",
    "labels = ['genres', 'genres_all', 'genres_top']\n",
    "d = [tracks['track', label].map(len) for label in labels]\n",
    "# Each legend entry also reports the largest number of genres found on a track.\n",
    "labels = ['{}\\nmax: {}'.format(label, d1.max()) for label, d1 in zip(labels, d)]\n",
    "\n",
    "print('#tracks without genre: {}'.format((tracks['track', 'genres'].map(len) == 0).sum()))\n",
    "\n",
    "MAX = 9\n",
    "fig, ax = plt.subplots(figsize=(5, 4))\n",
    "# Bins centered on the integers 0, 1, ..., MAX-2.\n",
    "ax.hist(d, bins=np.arange(MAX)-0.5, label=labels)\n",
    "ax.set_xlabel('#genres per track')\n",
    "ax.set_ylabel('#tracks')\n",
    "ax.set_xlim(-0.5, MAX-1.5)\n",
    "ax.set_xticks(range(MAX-1))\n",
    "# Derive the labels (in thousands) from the actual tick values, so that they stay\n",
    "# correct wherever matplotlib places the ticks.\n",
    "ax.yaxis.set_major_formatter(plt.FuncFormatter(\n",
    "    lambda y, pos: '0' if y == 0 else '{:.0f}k'.format(y / 1000)))\n",
    "ax.legend(loc='upper right')\n",
    "fig.tight_layout()\n",
    "fig.savefig('genres_per_track.pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of tracks per genre (full).\n",
    "# Only genres with more than 2000 tracks are drawn.\n",
    "d = genres[genres['#tracks'] > 2000].sort_values('#tracks', ascending=False)  # Poster: 5000\n",
    "plt.figure(figsize=(10, 4))  # Poster: (7, 4)\n",
    "# x and y are passed by keyword: recent seaborn versions reject them as positional arguments.\n",
    "p = sns.barplot(x='title', y='#tracks', data=d, color='k', alpha=0.4)\n",
    "p.set_xlabel('')\n",
    "p.set_ylabel('#tracks')\n",
    "plt.xticks(rotation=90)\n",
    "plt.tight_layout()\n",
    "plt.savefig('genre_distribution.pdf')\n",
    "\n",
    "# Smallest (non-zero) and largest number of tracks in a genre.\n",
    "genres.loc[genres['#tracks'] > 0, '#tracks'].min(), genres['#tracks'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of tracks per top-level genre (medium).\n",
    "in_medium = tracks['set', 'subset'] <= 'medium'\n",
    "d = tracks[in_medium]\n",
    "d = d['track', 'genres_top'].map(lambda x: x[0]).value_counts()\n",
    "# Replace the genre ids by their titles for the tick labels.\n",
    "d.index = genres.loc[d.index, 'title']\n",
    "\n",
    "plt.figure(figsize=(10, 4))  # Poster: (7, 4)\n",
    "d.plot.bar(color='k', alpha=0.4)\n",
    "plt.xlabel('')\n",
    "plt.ylabel('#tracks')\n",
    "plt.tight_layout()\n",
    "plt.savefig('genre_top_distribution.pdf')\n",
    "d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1 Genre hierarchy\n",
    "\n",
    "* As genres have parent genres, we can plot a tree using the [DOT] language.\n",
    "* Save the full genre tree as a PDF.\n",
    "\n",
    "Todo:\n",
    "* Color nodes according to FMA genre color.\n",
    "* Better looking tree.\n",
    "\n",
    "[DOT]: https://en.wikipedia.org/wiki/DOT_(graph_description_language)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Small tree rendered inline as a PNG.\n",
    "# NOTE(review): [25, 31] presumably are the genre ids used as roots and 1 a depth limit -\n",
    "# confirm against utils.Genres.create_tree.\n",
    "g = utils.Genres(genres)\n",
    "graph = g.create_tree([25, 31], 1)\n",
    "png = graph.create_png()\n",
    "ipd.Image(png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the PDF written here is overwritten below, as both calls write to\n",
    "# 'genre_hierarchy.pdf' - confirm whether this first tree should go to its own file.\n",
    "graph = g.create_tree(14)\n",
    "graph.write_pdf('genre_hierarchy.pdf');\n",
    "\n",
    "# Tree built from every root returned by find_roots().\n",
    "roots = g.find_roots()\n",
    "print('{} roots'.format(len(roots)))\n",
    "graph = g.create_tree(roots)\n",
    "graph.write_pdf('genre_hierarchy.pdf');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.2 Cross-appearance\n",
    "\n",
    "Todo:\n",
    "* Group rows and columns for better identification of related genres."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary track x genre indicator matrix built from the genre lists.\n",
    "enc = MultiLabelBinarizer()\n",
    "genres_indicator = enc.fit_transform(tracks['track', 'genres'])\n",
    "# Genre titles in the order of the indicator columns.\n",
    "genres_names = genres.loc[enc.classes_, 'title'].values\n",
    "# Entry (i, j) counts the tracks tagged with both genre i and genre j.\n",
    "cross_correlation = genres_indicator.T @ genres_indicator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clear the diagonal (in place) so that only pairs of distinct genres remain.\n",
    "np.fill_diagonal(cross_correlation, 0)\n",
    "\n",
    "plt.figure(figsize=(28, 28))\n",
    "# Zero counts map to -inf; silence the divide-by-zero warning that np.log raises for them.\n",
    "with np.errstate(divide='ignore'):\n",
    "    log_counts = np.log(cross_correlation)\n",
    "plt.imshow(log_counts)\n",
    "plt.yticks(range(len(genres_names)), genres_names);\n",
    "plt.xticks(range(len(genres_names)), genres_names, rotation=90);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the strictly lower triangle so that every pair of genres appears only once.\n",
    "cross_correlation = np.tril(cross_correlation, k=-1)\n",
    "sort = np.argsort(cross_correlation.flatten())\n",
    "\n",
    "N = 20\n",
    "# The N largest entries, largest first. The stop index is -N-1 because a slice\n",
    "# ending at -N would only return N-1 entries.\n",
    "indices = np.unravel_index(sort[:-N-1:-1], cross_correlation.shape)\n",
    "for i, j in zip(*indices):\n",
    "    print('{}: {} | {}'.format(cross_correlation[i, j], genres_names[i], genres_names[j]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5 Audio\n",
    "\n",
    "Todo: e.g. audio features (echonest / librosa, spectrograms) to show diversity."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6 Features\n",
    "\n",
    "Todo: understand features by listening to segments that have them, e.g. <http://musicinformationretrieval.com/feature_sonification.html>."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First five rows, displayed with two decimals.\n",
    "preview = features.head(5)\n",
    "preview.style.format('{:.2f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise scatter plots of the MFCC columns labelled '01' to '03', for two statistics.\n",
    "for statistic in ('mean', 'std'):\n",
    "    selected = features.loc[:, ('mfcc', statistic, slice('01','03'))]\n",
    "    sns.pairplot(selected);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7 Echonest features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows in the echonest table.\n",
    "n_echonest = len(echonest)\n",
    "print('Echonest features available for {} tracks.'.format(n_echonest))"
   ]
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 2
}
